---
title: "Data preparation of: Distinctive Voices: Political Speech, Rhetoric and the Substantive Representation of Women in European Parliaments"
author: "Jens Wäckerle and Bruno Castanho Silva"
date: \today
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, message=F, warning=F)
```

This script simply runs the Machine Learning models that were used to produce the predicted probabilities used in the models in the paper and Appendix. They are already contained in the `df.all` dataset, inside the `df_analysis.RData` file. The only purpose of this script is to re-generate those predicted probabilities to verify they are reproducible.

Attention, it takes 48-72 hours to run this entire script!

## Data and packages

```{r}
library(tidyverse)
library(quanteda)
library(glmnet)
library(magrittr)
library(purrr)
library(furrr)
library(xgboost)

load('df_analysis.RData')
```

## Functions to run ML in each dfm

```{r}
run_models <- function(i, df, legislatures,language) {
  require(quanteda)
  require(tidyverse)
  require(xgboost)
  require(glmnet)
  
  df<- dfm_subset(df, leg == legislatures[i])
  docvars(df)$female <- ifelse(docvars(df)$gender == 'female',1,0)
  outcome.all <- docvars(df)$female
  
  weights <- length(docvars(df)$female)/(2*table(docvars(df)$female))[as.character(docvars(df)$female)]
  
  set.seed(123)
  
  glmnet.fit<-cv.glmnet(df, 
                        outcome.all, 
                        family='binomial', 
                        nfold=20,
                        alpha = 0,
                        keep=T,
                        weights=weights)
  pred.ridge <- data.frame(predict(glmnet.fit, newx = df, 
                                   s=glmnet.fit$lambda.min, 
                                   type = 'response'))
  colnames(pred.ridge)<-'pred.glmnet'
  
  pred.oos <- data.frame(pred.oos = glmnet.fit$fit.preval[,which(glmnet.fit$lambda == glmnet.fit$lambda.min)])
    pred.oos$pred.oos <- 1 / (1 + exp(-pred.oos$pred.oos))
    names(pred.oos) <- 'pred.oos'
    
  
  set.seed(123)
  # Cross-validate boosting:
  df.boost<-tibble(etas = seq(0.1,0.9,0.2),
             depths = seq(1,10,2)) %>%
    tidyr::expand(etas,depths) # create a data frame with all combinations of values to try
  
  out.boost.cv <- future_map(c(1:nrow(df.boost)), ~run_boost(.x, df, outcome.all, 
                                                             df.boost$etas,
                                                             df.boost$depths))

  df.boost <- cbind(df.boost, bind_rows(out.boost.cv))
  
  best.boost <- df.boost %>% filter(., test_logloss_mean == min(test_logloss_mean))
  
  boost.fit<-xgboost(data=df, label=outcome.all,
                     max.depth=best.boost$depths,
                     eta = best.boost$etas,
                     nrounds=best.boost$ntrees,
                     objective='binary:logistic',
                     verbose=0,
                     weight = weights,
                     eval_metric = 'logloss')
  
  pred.boost<-data.frame(predict(boost.fit, newdata = df))
  names(pred.boost)<-'pred.boost'
  
  ## OOS Boosting predictions:
   boost.fitcv<-xgb.cv(data=df, label=outcome.all,
                       max.depth=best.boost$depths,
                       eta = best.boost$etas,
                       nrounds=best.boost$ntrees,
                       objective='binary:logistic',
                       verbose=0,
                       weight = weights,
                       eval_metric = 'logloss',
                      nfold = 5,
                      prediction = T)
    pred.boost.oos <- data.frame(boost.fitcv$pred)
    names(pred.boost.oos) <- 'pred.boost.oos'
    
  
    out <- data.frame(pred.ridge = pred.ridge, pred.boost = pred.boost, pred.oos = pred.oos, pred.boost.oos = pred.boost.oos)
    
    
  return(out)
}

# Function to run the boosting algorithm for cross-validation, used in the function above;

run_boost <- function(i,df, outcome,  eta.n, depth.n) {
  require(xgboost)
  eta = eta.n[i]
  max.depth = depth.n[i]
  fit <- xgb.cv(data = df, label = outcome,
                 max.depth = max.depth,
                 eta = eta,
                 nrounds = 300, 
                 objective = 'binary:logistic',
                 verbose = 0,
                 weights = weights,
                nfold = 5,
                eval_metric = 'logloss')
  
  min.logloss <- fit$evaluation_log[
    which.min(fit$evaluation_log$test_logloss_mean),
    'test_logloss_mean']
  ntrees <- which.min(fit$evaluation_log$test_logloss_mean)
  out <- data.frame(min.logloss = min.logloss, ntrees = ntrees)
  return(out)
}
  
```


# Germany

```{r}
legislatures <- unique(docvars(dfm.de)$leg)

germany.bygov <- future_map(c(1:length(legislatures)), ~run_models(.x, dfm.de, legislatures = legislatures, language='german'),
                            options = furrr_options(seed = 123))

germany.bygov.df <- bind_rows(germany.bygov)

germany.bygov.df$docid <- rownames(germany.bygov.df)
```

# Netherlands

```{r}
legislatures <- unique(docvars(dfm.nl)$leg)

nl.bygov <- future_map(c(1:length(legislatures)), ~run_models(.x, dfm.nl, legislatures = legislatures, language = 'dutch'),
                       options = furrr_options(seed = 123))

nl.bygov.df <- bind_rows(nl.bygov)

nl.bygov.df$docid <- rownames(nl.bygov.df)
```

# Sweden

```{r}
legislatures <- unique(docvars(dfm.se)$leg)

se.bygov <- future_map(c(1:length(legislatures)), ~run_models(.x, dfm.se, legislatures = legislatures, language = 'swedish'),
                       options = furrr_options(seed = 123))

se.bygov.df <- bind_rows(se.bygov)

se.bygov.df$docid <- rownames(se.bygov.df)
```

# Spain

```{r}
legislatures <- unique(docvars(dfm.es)$leg)

spain.bygov <- future_map(c(1:length(legislatures)), ~run_models(.x, dfm.es, legislatures = legislatures, 
                                                                 language = 'spanish'),
                       options = furrr_options(seed = 123))
spain.bygov.df <- bind_rows(spain.bygov)

spain.bygov.df$docid <- rownames(spain.bygov.df)
```

# Ireland

```{r}
legislatures <- unique(docvars(dfm.ie)$leg)

ie.bygov <- future_map(c(1:length(legislatures)), ~run_models(.x, dfm.ie, legislatures = legislatures, 'english'),
                       options = furrr_options(seed = 123))

ie.bygov.df <- bind_rows(ie.bygov)

ie.bygov.df$docid <- rownames(ie.bygov.df)
```

# Check results match

The next lines combine all predictions and compare the estimates obtained to those in the main dataset used for the analysis, showing they are exactly the same:

```{r}
ie.bygov.df$country_label <- 'Ireland'
nl.bygov.df$country_label <- 'Netherlands'
spain.bygov.df$country_label <- 'Spain'
germany.bygov.df$country_label <- 'Germany'
se.bygov.df$country_label <- 'Sweden'

df.combined <- bind_rows(ie.bygov.df, spain.bygov.df, se.bygov.df, nl.bygov.df, germany.bygov.df)

df.all <- left_join(df.all, df.combined, by = c('country_label','docid'))

cor(df.all$pred.boost.oos.x,df.all$pred.boost.oos.y)
cor(df.all$pred.oos.x,df.all$pred.oos.y)
cor(df.all$pred.glmnet.x,df.all$pred.glmnet.y)
cor(df.all$pred.boost.oos.x,df.all$pred.boost.oos.y)
```